Setup

Assign variables

date <- Sys.Date()

in_dir <- file.path("..", "results/integrated_dfs")

out_dir <- file.path("..", "results/expression_output")

options(dplyr.summarise.inform = FALSE)

Import files

d.DeKegel_top5_pairs_drug_sens_expr_annot <- read_rds(file.path(in_dir, "d.DeKegel_top5_pairs_drug_sens_expr_annot"))

d.drug_sens_all_genes_expr <- read_rds(file.path(in_dir, "d.drug_sens_all_genes_expr"))

Reformat DFs

## keep only relevant columns
d.DeKegel_top5_pairs_drug_sens_expr <- d.DeKegel_top5_pairs_drug_sens_expr_annot %>%
  dplyr::select(-c(entrez_pair:prediction_rank, prediction_score:family_size,
                   broad_id, primary_disease, subtype_disease))

d.drug_sens_all_genes_expr <- d.drug_sens_all_genes_expr %>%
  rename(log_tpm = rna_expression)

## how many gene pairs are included in the paralog dataset?
d.DeKegel_top5_pairs_drug_sens_expr %>%
  distinct(sorted_gene_pair) %>%
  nrow()
## [1] 367

Import functions

source("shared_functions.R")

Analysis

Single-gene

This analysis is intended to demonstrate expected results for targeted inhibitors and their target (single) genes.

PRISM Phase 1 drug sensitivity

Continuous/linear regression

## running lm for each sorted_gene_pair/compound_id group

## use broom::glance to get rsquared and pval
d.drug_sens_all_genes_expr_lm_summary <- d.drug_sens_all_genes_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::glance(lm(dependency ~ log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  rename(r_squared = r.squared, p_val = p.value)
# d.drug_sens_all_genes_expr_lm_summary

## FDR-adjust p-vals for multiple testing and add back to summary DF
d.stats <- BH_adjust(d.drug_sens_all_genes_expr_lm_summary)
# d.stats

d.drug_sens_all_genes_expr_lm_summary <- d.stats %>%
  dplyr::select(-p_val) %>%
  right_join(d.drug_sens_all_genes_expr_lm_summary, by = c("target", "compound_id")) %>%
  dplyr::select(target, compound_id, r_squared, p_val, fdr)
# d.drug_sens_all_genes_expr_lm_summary

## use broom::tidy to get slope
d.drug_sens_all_genes_expr_lm_slope <- d.drug_sens_all_genes_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::tidy(lm(dependency ~ log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  ## extract slope of lm fit line from output
  filter(term == "log_tpm") %>%
  rename(slope = estimate) %>%
  dplyr::select(target, compound_id, slope) 
# d.drug_sens_all_genes_expr_lm_slope

## add rsquared, p-val, fdr, and slope info to main DF
d.drug_sens_all_genes_expr_lm <- d.drug_sens_all_genes_expr %>%
  left_join(d.drug_sens_all_genes_expr_lm_summary, by = c("target", "compound_id")) %>%
  left_join(d.drug_sens_all_genes_expr_lm_slope, by = c("target", "compound_id")) %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(log_tpm))

summarize_stats(d.drug_sens_all_genes_expr_lm)
## df: d.drug_sens_all_genes_expr_lm 
## # target/compound pairs with
## p < 0.05: 1236 
## p < 0.01: 565 
## FDR < 0.1: 369 
## FDR < 0.05: 275
## number of pairs with negative vs. positive slope
d.drug_sens_all_genes_expr_lm %>%
  distinct(target, compound_id, .keep_all = TRUE) %>%
  mutate(slope_flag = ifelse(slope < 0, "negative", "positive")) %>%
  group_by(slope_flag) %>%
  summarize(n = n()) %>%
  print_kbl()
slope_flag n
negative 5329
positive 5614
## get low expression = low LFC pairs (positive slope)
d.drug_sens_all_genes_expr_lm_pos_slope <-d.drug_sens_all_genes_expr_lm %>%
  filter(slope > 0)

## get high expression = low LFC pairs (negative slope)
d.drug_sens_all_genes_expr_lm_neg_slope <- d.drug_sens_all_genes_expr_lm %>%
  filter(slope < 0)

## summarize significant drug/gene combos for each new DF
summarize_stats(d.drug_sens_all_genes_expr_lm_pos_slope)
## df: d.drug_sens_all_genes_expr_lm_pos_slope 
## # target/compound pairs with
## p < 0.05: 541 
## p < 0.01: 194 
## FDR < 0.1: 105 
## FDR < 0.05: 74
summarize_stats(d.drug_sens_all_genes_expr_lm_neg_slope)
## df: d.drug_sens_all_genes_expr_lm_neg_slope 
## # target/compound pairs with
## p < 0.05: 695 
## p < 0.01: 371 
## FDR < 0.1: 264 
## FDR < 0.05: 201
Plot
example genes
d.top_neg_hits <- d.drug_sens_all_genes_expr_lm_neg_slope %>% 
  unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>% 
  arrange(fdr) %>%
  filter(fdr < 0.05) %>%
  distinct(target_compound_label)
print_kbl(head(d.top_neg_hits, n = 20))
target_compound_label
MDM2 idasanutlin
MDM2 CGM097
MDM2 AMG-232
ERBB2 GW-583340
ERBB2 AST-1306
PDE3A anagrelide
ERBB2 lapatinib
FGFR2 AEE788
PDGFRA lucitanib
MDM2 nutlin-3
ERBB2 tucatinib
FGFR1 erdafitinib
ERBB2 AZD8931
ERBB2 BMS-690514
PDGFRA masitinib
ERBB2 PD-168393
PDGFRA pazopanib
EGFR PD-153035
FGFR3 AEE788
EGFR BMS-690514
make_lm_plot <- function(df, plot_list, x_val){
  d.plot <- df %>%
    unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
    filter(target_compound_label %in% plot_list) %>%
    mutate("target_compound_label" = reorder(target_compound_label, fdr))
  
  d.label <- d.plot %>%
    group_by(target, compound_id) %>%
    mutate(group_label_x = min({{x_val}}),
           group_label_y = max(dependency)) %>%
    dplyr::ungroup() %>%
    dplyr::select(target_compound_label, r_squared, p_val, fdr, group_label_x, group_label_y) %>%
    distinct(target_compound_label, .keep_all = TRUE) %>%
    mutate(r_squared = round(r_squared, 3),
           p_val = scientific(p_val, 3),
           fdr = scientific(fdr, 3)) 
  # d.label
    
  plot <- ggplot(d.plot, aes(x = {{x_val}}, y = dependency)) +
    geom_hline(yintercept = 0) +
    geom_point(size = 1) +
    geom_smooth(method = lm, se = FALSE, color = "gray60") +
    geom_richtext(data = d.label,
                  aes(x = group_label_x, y = group_label_y,
                  ## this package can do HTML-formatting for text
                  label = paste0("r<sup>2</sup>=", r_squared, "<br>", 
                                 "FDR =", fdr)),
                  ## adjust label alignment and remove borders
                  hjust = 0, vjust = 1, label.color = NA, size = 3,
                  ## label background = transparent, text is not
                  fill = alpha(c("white"), 0.75),
                  label.margin = unit(c(0, 0, 0, 0), "lines"),
                  label.padding = unit(c(0, 0, 0, 0), "lines")) +
    labs(x = "log2(TPM)", y = "Cell_viability") +
    theme_bw() +
    theme(aspect.ratio = 1,
          axis.text = element_text(color = "black"),
          axis.ticks = element_line(color = "black")) +
    facet_wrap(~target_compound_label, scales = "free", ncol = 3)
  
  return(plot)
}
plot_list <- c("MDM2\nidasanutlin", "ERBB2\nGW-583340", "EGFR\nPD-153035")


selected_top_lm_hits <- make_lm_plot(d.drug_sens_all_genes_expr_lm_neg_slope, plot_list, log_tpm)
selected_top_lm_hits

d.drug_sens_all_genes_expr_lm_neg_slope_topHits <- d.drug_sens_all_genes_expr_lm_neg_slope %>% 
  filter(fdr < 0.05) %>%
  arrange(fdr) %>%
  distinct(target, compound_id, .keep_all = TRUE) %>%
  dplyr::select(target, compound_name, slope, r_squared:fdr, moa)
print_kbl(head(d.drug_sens_all_genes_expr_lm_neg_slope_topHits))
target compound_name slope r_squared p_val fdr moa
MDM2 idasanutlin -0.6634255 0.2571951 0 0 MDM inhibitor
MDM2 CGM097 -0.5986578 0.2523123 0 0 MDM inhibitor
MDM2 AMG-232 -0.5354975 0.2197390 0 0 MDM inhibitor
ERBB2 GW-583340 -0.1913803 0.1504336 0 0 EGFR inhibitor
ERBB2 AST-1306 -0.1943274 0.1445403 0 0 EGFR inhibitor
PDE3A anagrelide -0.2086442 0.1393173 0 0 phosphodiesterase inhibitor
d.drug_sens_all_genes_expr_lm_pos_slope_fdr_topHits <- d.drug_sens_all_genes_expr_lm_pos_slope %>% 
  filter(fdr < 0.05) %>%
  arrange(fdr) %>%
  distinct(target, compound_id, .keep_all = TRUE) %>%
  dplyr::select(target, compound_name, slope, r_squared:fdr, moa)
print_kbl(head(d.drug_sens_all_genes_expr_lm_pos_slope_fdr_topHits))
target compound_name slope r_squared p_val fdr moa
AKT3 GSK2110183 0.1494385 0.0982682 0 0.0e+00 AKT inhibitor
AKT3 MK-2206 0.1194117 0.0873418 0 0.0e+00 AKT inhibitor
NQO1 menadione 0.1318066 0.0801592 0 0.0e+00 mitochondrial DNA polymerase inhibitor, phosphatase inhibitor
PIK3CD alpelisib 0.1273025 0.0629412 0 4.0e-07 PI3K inhibitor
AKT3 GDC-0068 0.1148292 0.0607472 0 7.0e-07 AKT inhibitor
ATP1A1 k-strophanthidin 0.3351501 0.0587694 0 1.6e-06 ATPase inhibitor

Z-scaled outliers

Since many target genes do not show a truly linear pattern of expression vs. cell viability, and in some cases the linear regression fit line seemed to be strongly influenced by outliers, we decided to take another approach to model these data. We instead used an outlier-based approach to group cell lines into low, high, and “neither” expression categories.

## robustly scale expression data
d.drug_sens_all_genes_expr_outlier <- d.drug_sens_all_genes_expr %>%
  filter(!is.na(log_tpm)) %>%
  filter(!is.na(dependency)) %>%
  group_by(target, compound_id) %>% 
  mutate(scaled_log_tpm = RobScale(log_tpm, center = TRUE, scale = TRUE)) %>%
  mutate(expr_outlier_flag = case_when(
    scaled_log_tpm < -2 ~ "low",
    scaled_log_tpm > 2 ~ "high", 
    TRUE ~ "neither")) %>%
  mutate(expr_low_flag = case_when(
    scaled_log_tpm < -2 ~ TRUE,
    TRUE ~ FALSE)) %>% ## if expression ≥ -2, classify as "not low"
  ungroup() %>%
  mutate(expr_outlier_flag = factor(expr_outlier_flag, levels = c("low", "neither", "high"))) 

## how many target/compound pairs have no expression outliers?
d.drug_sens_all_genes_expr_outlier %>%
  group_by(target, compound_id) %>%
  distinct(expr_low_flag) %>%
  summarize(n = n()) %>%
  filter(n < 2) %>%
  nrow() %>%
  print_kbl()
x
8236
Split into positive and negative outliers
d.drug_sens_all_genes_expr_outlier %>%
  group_by(target, compound_id, expr_outlier_flag) %>%
  summarize(n = n()) %>%
  group_by(expr_outlier_flag) %>%
  summarize(mean = mean(n),
            median = median(n)) %>%
  print_kbl()
expr_outlier_flag mean median
low 24.69228 21
neither 451.99305 455
high 95.56378 99
## how many compounds/targets have 0 high or low outlier cell lines?
d.drug_sens_all_genes_expr_outlier %>%
  group_by(target, compound_id, expr_outlier_flag) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  group_by(target, compound_id) %>%
  summarize(n = n()) %>%
  filter(n < 3) %>%
  nrow() %>%
  print_kbl()
x
8276
## low vs. neither (with high included but ignoring it) => 
## calculate diff in median LFC between relevant groups

d.drug_sens_all_genes_expr_outlier_low <- d.drug_sens_all_genes_expr_outlier %>%
  group_by(target, compound_id) %>%
  ## get rid of drug/targets that don't have any "low" cell lines
  filter(any(expr_outlier_flag == "low")) %>%
  ungroup() 

d.drug_sens_all_genes_expr_outlier_low_grp_med_diff <- d.drug_sens_all_genes_expr_outlier_low %>%
  group_by(target, compound_id, expr_outlier_flag) %>%
  ## calculate median dependency score for each outlier group
  summarize(grp_med_dependency = median(dependency)) %>%
  pivot_wider(names_from = expr_outlier_flag, values_from = grp_med_dependency) %>%
  ## calculate difference between low and neither groups
  mutate(grp_med_diff = low - neither) %>%
  dplyr::select(target, compound_id, grp_med_diff) 

d.drug_sens_all_genes_expr_outlier_low <- d.drug_sens_all_genes_expr_outlier_low %>%
  left_join(d.drug_sens_all_genes_expr_outlier_low_grp_med_diff, by = c("target", "compound_id"))

## high vs. neither (with low included but ignoring it) => 
## calculate diff in median LFC between relevant groups =>
d.drug_sens_all_genes_expr_outlier_high <- d.drug_sens_all_genes_expr_outlier %>%
  group_by(target, compound_id) %>%
  ## get rid of drug/targets that don't have any "high" cell lines
  filter(any(expr_outlier_flag == "high")) %>%
  ungroup() 

d.drug_sens_all_genes_expr_outlier_high_grp_med_diff <- d.drug_sens_all_genes_expr_outlier_high %>%
  group_by(target, compound_id, expr_outlier_flag) %>%
  ## calculate median dependency score for each outlier group
  summarize(grp_med_dependency = median(dependency)) %>%
  pivot_wider(names_from = expr_outlier_flag, values_from = grp_med_dependency) %>%
  ## calculate difference between low and neither groups
  mutate(grp_med_diff = high - neither) %>%
  dplyr::select(target, compound_id, grp_med_diff) 

d.drug_sens_all_genes_expr_outlier_high <- d.drug_sens_all_genes_expr_outlier_high %>%
  left_join(d.drug_sens_all_genes_expr_outlier_high_grp_med_diff, by = c("target", "compound_id"))
## low outliers and drug sensitivity
## calculate p-value (adjusted for multiple within-group tests?)
d.drug_sens_all_genes_expr_outlier_low_wrst_pvals <- d.drug_sens_all_genes_expr_outlier_low %>%
  ## remove cell lines that are high outliers 
  filter(expr_outlier_flag != "high") %>%
  group_by(target, compound_id) %>%
  summarize(p_val = wilcox.test(dependency ~ expr_outlier_flag)$p.value)
# d.drug_sens_all_genes_expr_outlier_low_wrst_sens_pvals

## adjust for multiple testing overall
d.stats <- BH_adjust(d.drug_sens_all_genes_expr_outlier_low_wrst_pvals)
## Adding missing grouping variables: `target`
# d.stats

d.drug_sens_all_genes_expr_outlier_low_wrst <- d.drug_sens_all_genes_expr_outlier_low %>%
  left_join(d.stats, by = c("target", "compound_id")) 

d.drug_sens_all_genes_expr_outlier_low_wrst_sens <- d.drug_sens_all_genes_expr_outlier_low_wrst %>%
  filter(grp_med_diff < 0)

d.drug_sens_all_genes_expr_outlier_low_wrst_resist <- d.drug_sens_all_genes_expr_outlier_low_wrst %>%
  filter(grp_med_diff > 0)

# d.drug_sens_all_genes_expr_outlier_low_wrst_resist

## high outliers and drug sensitivity
d.drug_sens_all_genes_expr_outlier_high_wrst_sens_pvals <- d.drug_sens_all_genes_expr_outlier_high %>%
  ## remove cell lines that are low outliers 
  filter(expr_outlier_flag != "low") %>%
  group_by(target, compound_id) %>%
  summarize(p_val = wilcox.test(dependency ~ expr_outlier_flag)$p.value)
# d.drug_sens_all_genes_expr_outlier_high_wrst_sens_pvals

## adjust for multiple testing overall
d.stats <- BH_adjust(d.drug_sens_all_genes_expr_outlier_high_wrst_sens_pvals)
## Adding missing grouping variables: `target`
# d.stats

d.drug_sens_all_genes_expr_outlier_high_wrst <- d.drug_sens_all_genes_expr_outlier_high %>%
  left_join(d.stats, by = c("target", "compound_id"))
# d.drug_sens_all_genes_expr_outlier_high_wrst_sens

## high outliers and drug sensitivity
d.drug_sens_all_genes_expr_outlier_high_wrst_sens <- d.drug_sens_all_genes_expr_outlier_high_wrst %>%
  ## remove cell lines where high outliers don't have a higher median dependency score than the other groups
  filter(grp_med_diff < 0)

d.drug_sens_all_genes_expr_outlier_high_wrst_resist <- d.drug_sens_all_genes_expr_outlier_high_wrst %>%
  ## remove cell lines where low outliers don't have a lower median dependency score than the other groups
  filter(grp_med_diff > 0)
## low outliers
summarize_stats(d.drug_sens_all_genes_expr_outlier_low_wrst_sens)
## df: d.drug_sens_all_genes_expr_outlier_low_wrst_sens 
## # target/compound pairs with
## p < 0.05: 89 
## p < 0.01: 31 
## FDR < 0.1: 4 
## FDR < 0.05: 3
summarize_stats(d.drug_sens_all_genes_expr_outlier_low_wrst_resist)
## df: d.drug_sens_all_genes_expr_outlier_low_wrst_resist 
## # target/compound pairs with
## p < 0.05: 189 
## p < 0.01: 70 
## FDR < 0.1: 26 
## FDR < 0.05: 21
## high outliers
summarize_stats(d.drug_sens_all_genes_expr_outlier_high_wrst_sens)
## df: d.drug_sens_all_genes_expr_outlier_high_wrst_sens 
## # target/compound pairs with
## p < 0.05: 423 
## p < 0.01: 145 
## FDR < 0.1: 33 
## FDR < 0.05: 24
summarize_stats(d.drug_sens_all_genes_expr_outlier_high_wrst_resist)
## df: d.drug_sens_all_genes_expr_outlier_high_wrst_resist 
## # target/compound pairs with
## p < 0.05: 417 
## p < 0.01: 101 
## FDR < 0.1: 5 
## FDR < 0.05: 5
Plot
make_outlier_plot <- function(df, plot_list, direction, outlier_var, expr_var){

  d.plot <- df %>%
    unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
    filter(target_compound_label %in% plot_list) %>%
    mutate("target_compound_label" = reorder(target_compound_label, fdr))
  
  d.label <- d.plot %>%
    group_by(target, compound_id) %>%
    mutate(y.position = (max(dependency) + 0.1*(max(dependency) - min(dependency)))) %>%
    dplyr::ungroup() %>%
    dplyr::select(target_compound_label, fdr, y.position) %>%
    distinct(target_compound_label, .keep_all = TRUE) %>%
    mutate(group1 = direction,
           group2 = "neither",
           fdr = scientific(fdr, 3)) 
  # d.label
  
  p <-  ggplot(d.plot, aes(x = {{outlier_var}}, y = dependency)) +
    geom_hline(yintercept = 0) +
    geom_jitter(aes(color = {{outlier_var}}), width = 0.3, alpha = 0.6) +
    geom_boxplot(aes(fill = {{outlier_var}}), outlier.shape = NA, coef = 0) +
    stat_pvalue_manual(d.label, label = paste0("FDR = {fdr}"), tip.length = 0.03, size = 3, vjust = -0.5) +
    ## adds extra space on top & bottom so p-value label will fit
    scale_y_continuous(expand = expansion(mult = 0.15)) +
    # scale_color_manual(values = c("low" = "blue", "neither" = "gray50", "high" = "red")) +
    scale_color_manual(values = c("low" = "#0080C6", "neither" = "#7F7F7F", "high" = "#B84250")) +
    scale_fill_manual(values = c("low" = "#0080C6", "neither" = "#7F7F7F", "high" = "#B84250")) +
    labs(x = "Cell_viability",
         y = "Expression_group") +
    theme_bw() +
    theme(aspect.ratio = 0.75,
          axis.text = element_text(color = "black"),
          axis.ticks = element_line(color = "black"),
          legend.position = "none") +
    facet_wrap(~target_compound_label, scales = "free_y",
                        ncol = 1)
  
  q <- ggplot(d.plot, aes(x = target, y = {{expr_var}})) +
    geom_jitter(aes(color = {{outlier_var}}), width = 0.3) +
    geom_boxplot(alpha = 0.3, outlier.shape = NA, coef = 0) +
    # scale_color_manual(values = c("low" = "blue", "neither" = "gray50", "high" = "red")) +
    scale_color_manual(values = c("low" = "#0080C6", "neither" = "#7F7F7F", "high" = "#B84250")) +
    labs(x = "Target", y = "log2(TPM)") +
    theme_bw() +
    theme(aspect.ratio = 1,
          axis.text = element_text(color = "black"),
          axis.ticks = element_line(color = "black")) +
    facet_wrap(~target_compound_label, scale = "free", ncol = 1)
  
  ggarrange(p, q, ncol = 2, align = "hv", 
            common.legend = TRUE, legend = "right")

}
make_outlier_plot(d.drug_sens_all_genes_expr_outlier_low_wrst_resist,
                  c("MDM2\nidasanutlin", "ERBB2\nGW-583340", "EGFR\nPD-153035"), "low", expr_outlier_flag, log_tpm)

Compare this output to that from the linear model-based approach:

selected_top_lm_hits
## `geom_smooth()` using formula 'y ~ x'

Plot lineage

What does the expression of each target gene look like across cell lineages?

make_lineage_plot <- function(df, plot_list, y_var, color_flag){

  d.plot <- df %>% 
    ungroup() %>%
    unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
    filter(target_compound_label %in% plot_list) %>%
    mutate("target_compound_label" = reorder(target_compound_label, fdr),
           "target" = reorder(target, fdr)) 

  ggplot(d.plot, aes(x = lineage, y = {{y_var}})) +
    geom_jitter(aes(color = {{color_flag}}), width = 0.3, size = 1) +
    geom_boxplot(alpha = 0, outlier.shape = NA, coef = 0) +
    scale_color_manual(values = c("low" = "#0080C6", "neither" = "#7F7F7F", "high" = "#B84250")) +
    labs(x = "Cell_lineage", y = "log2(TPM)") +
    theme_bw() +
    theme(axis.text = element_text(color = "black"),
          axis.ticks = element_line(color = "black"),
          axis.text.x = element_text(angle = 45, hjust=1)) +
    facet_wrap(~target, scales = "free_y",
               ncol = 1)
}
make_lineage_plot(d.drug_sens_all_genes_expr_outlier_low_wrst_resist, 
                  c("MDM2\nidasanutlin", "ERBB2\nGW-583340", "EGFR\nPD-153035"), log_tpm, expr_outlier_flag)

Paralogs

PRISM Phase 1 drug sensitivity

## add target col so single-gene functions will work
d.DeKegel_top5_pairs_drug_sens_expr <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  mutate(target = sorted_gene_pair)
# d.DeKegel_top5_pairs_drug_sens_expr

Continuous/linear regression

## A1 expr

## use broom::glance to get rsquared and pval
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A1_log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::glance(lm(dependency ~ A1_log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  rename(r_squared = r.squared, p_val = p.value)
# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary

## FDR-adjust p-vals for multiple testing and add back to summary DF
d.stats <- BH_adjust(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary)
# d.stats

d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary <- d.stats %>%
  dplyr::select(-p_val) %>%
  right_join(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary, by = c("target", "compound_id")) %>%
  dplyr::select(target, compound_id, r_squared, p_val, fdr)
# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary

## use broom::tidy to get slope
## note: group_modify expects a DF output
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_slope <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A1_log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::tidy(lm(dependency ~ A1_log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  ## extract slope of lm fit line from output
  filter(term == "A1_log_tpm") %>%
  rename(slope = estimate) %>%
  dplyr::select(target, compound_id, slope) 
# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_slope

## add rsquared, p-val, fdr, and slope info to main DF
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary, by = c("target", "compound_id")) %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_slope, by = c("target", "compound_id")) %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A1_log_tpm))

summarize_stats(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm)
## df: d.DeKegel_top5_pairs_drug_sens_A1_expr_lm 
## # target/compound pairs with
## p < 0.05: 481 
## p < 0.01: 249 
## FDR < 0.1: 251 
## FDR < 0.05: 142
## A2 expr

## use broom::glance to get rsquared and pval
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A2_log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::glance(lm(dependency ~ A2_log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  rename(r_squared = r.squared, p_val = p.value)
# d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary

## FDR-adjust p-vals for multiple testing and add back to summary DF
d.stats <- BH_adjust(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary)
# d.stats

d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary <- d.stats %>%
  dplyr::select(-p_val) %>%
  right_join(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary, by = c("target", "compound_id")) %>%
  dplyr::select(target, compound_id, r_squared, p_val, fdr)
# d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary

## use broom::tidy to get slope
## note: group_modify expects a DF output
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_slope <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A2_log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::tidy(lm(dependency ~ A2_log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  ## extract slope of lm fit line from output
  filter(term == "A2_log_tpm") %>%
  rename(slope = estimate) %>%
  dplyr::select(target, compound_id, slope) 
# d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_slope

## add rsquared, p-val, fdr, and slope info to main DF
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary, by = c("target", "compound_id")) %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_slope, by = c("target", "compound_id")) %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A2_log_tpm))

summarize_stats(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm)
## df: d.DeKegel_top5_pairs_drug_sens_A2_expr_lm 
## # target/compound pairs with
## p < 0.05: 340 
## p < 0.01: 156 
## FDR < 0.1: 128 
## FDR < 0.05: 74
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm %>%
  distinct(target, compound_id, .keep_all = TRUE) %>%
  mutate(slope_flag = ifelse(slope < 0, "negative", "positive")) %>%
  group_by(slope_flag) %>%
  summarize(n = n()) %>%
  print_kbl()
slope_flag n
negative 1343
positive 1039
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm %>%
  distinct(target, compound_id, .keep_all = TRUE) %>%
  mutate(slope_flag = ifelse(slope < 0, "negative", "positive")) %>%
  group_by(slope_flag) %>%
  summarize(n = n()) %>%
  print_kbl()
slope_flag n
negative 1222
positive 1160
## get low expression = low LFC pairs (positive slope)
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope <-d.DeKegel_top5_pairs_drug_sens_A1_expr_lm %>%
  filter(slope > 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope 
## # target/compound pairs with
## p < 0.05: 120 
## p < 0.01: 48 
## FDR < 0.1: 48 
## FDR < 0.05: 29
## get high expression = low LFC pairs (negative slope)
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope <- d.DeKegel_top5_pairs_drug_sens_A1_expr_lm %>%
  filter(slope < 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope 
## # target/compound pairs with
## p < 0.05: 361 
## p < 0.01: 201 
## FDR < 0.1: 203 
## FDR < 0.05: 113
## get low expression = low LFC pairs (positive slope)
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope <-d.DeKegel_top5_pairs_drug_sens_A2_expr_lm %>%
  filter(slope > 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope 
## # target/compound pairs with
## p < 0.05: 158 
## p < 0.01: 82 
## FDR < 0.1: 67 
## FDR < 0.05: 39
## get high expression = low LFC pairs (negative slope)
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope <- d.DeKegel_top5_pairs_drug_sens_A2_expr_lm %>%
  filter(slope < 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope 
## # target/compound pairs with
## p < 0.05: 182 
## p < 0.01: 74 
## FDR < 0.1: 61 
## FDR < 0.05: 35
Plot A1 top hits
A1_top_hits <- d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope %>%
  dplyr::select(target, compound_name, r_squared, fdr) %>%
  distinct(target, compound_name, .keep_all = TRUE) %>%
  arrange(fdr) 

print_kbl(head(A1_top_hits, n = 20))
target compound_name r_squared fdr
FYN_SRC vandetanib 0.0659658 0.0000014
FYN_SRC saracatinib 0.0572548 0.0000035
ATP1A1_ATP1A2 k-strophanthidin 0.0587694 0.0000035
ATP1A1_ATP1A3 k-strophanthidin 0.0587694 0.0000035
ATP1A1_ATP1A4 k-strophanthidin 0.0587694 0.0000035
FYN_YES1 saracatinib 0.0572548 0.0000035
FYN_SRC PD-168393 0.0534604 0.0000095
ARAF_RAF1 AZ-628 0.0376702 0.0004196
MAPK11_MAPK14 tyrphostin-AG-1478 0.0336892 0.0008928
FYN_SRC bosutinib 0.0326309 0.0011196
FYN_YES1 bosutinib 0.0326309 0.0011196
MAP2K1_MAP2K2 arctigenin 0.0301968 0.0024172
CDK2_CDK3 NU6027 0.0258299 0.0066929
CDK2_CDK5 NU6027 0.0258299 0.0066929
EPAS1_HIF1A BAY-87-2243 0.0245436 0.0097530
EPAS1_HIF1A 2-methoxyestradiol 0.0239446 0.0107025
NR1I2_VDR erlotinib 0.0219698 0.0137919
CAMK2D_CAMK2G bosutinib 0.0215397 0.0151139
ESR1_ESR2 ethynodiol-diacetate 0.0209548 0.0180253
GSK3A_GSK3B SB-203580 0.0209268 0.0181552
A1_top_hits_plot_list <- A1_top_hits %>%
  arrange(fdr) %>%
  unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
  head(n = 9) %>%
  pull(target_compound_label)
# A1_top_hits_plot_list

make_lm_plot(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope, A1_top_hits_plot_list, A1_log_tpm)

# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope
Plot A2 top hits
A2_top_hits <- d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope %>%
  dplyr::select(target, compound_name, r_squared, fdr) %>%
  distinct(target, compound_name, .keep_all = TRUE) %>%
  arrange(fdr) 

print_kbl(head(A2_top_hits, n = 20))
target compound_name r_squared fdr
AKT1_AKT3 GSK2110183 0.0982682 0.0000000
AKT2_AKT3 GSK2110183 0.0982682 0.0000000
AKT1_AKT3 MK-2206 0.0873418 0.0000000
AKT2_AKT3 MK-2206 0.0873418 0.0000000
AKT1_AKT3 GDC-0068 0.0607472 0.0000013
AKT2_AKT3 GDC-0068 0.0607472 0.0000013
MAP2K1_MAP2K2 bosutinib 0.0474871 0.0000464
AKT1_AKT3 AZD5363 0.0474116 0.0000464
AKT2_AKT3 AZD5363 0.0474116 0.0000464
ABL1_ABL2 saracatinib 0.0444061 0.0000956
PIK3R1_PIK3R2 CUDC-907 0.0410053 0.0003328
CDK2_CDK5 bosutinib 0.0371005 0.0006195
PTPN11_PTPN6 BVT-948 0.0264080 0.0107209
CDK1_CDK2 NU6027 0.0258299 0.0111549
HDAC4_HDAC5 romidepsin 0.0246434 0.0157450
EPAS1_HIF1A BAY-87-2243 0.0243860 0.0160324
ROCK1_ROCK2 SB-203580 0.0236855 0.0187571
MAP3K2_MAP3K3 bosutinib 0.0233228 0.0195743
AKT1_AKT2 A-674563 0.0221977 0.0259942
PPP3CA_PPP3CB cyclosporin-a 0.0240152 0.0260890
A2_top_hits_plot_list <- A2_top_hits %>%
  arrange(fdr) %>%
  unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
  head(n = 9) %>%
  pull(target_compound_label)
# A1_top_hits_plot_list

make_lm_plot(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope, A2_top_hits_plot_list, A2_log_tpm)

# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope
Write output df
# write_paralog_lm_output_df <- function(df, col_name, stat_cutoff){
#   
#   df_out <- df %>%
#     filter({{col_name}} < stat_cutoff) %>%
#     arrange({{col_name}}) %>%
#     distinct(target, compound_id, .keep_all = TRUE) %>%
#     dplyr::select(target, compound_name, r_squared:slope, moa)
#   
#   df_name <- deparse(substitute(df))
#   stat_name <- deparse(substitute(col_name))
#   
#   # df_name <- paste0(deparse(substitute(df)), "_",
#   #                   deparse(substitute(col_name)), "_",
#   #                   "L", stat_cutoff, ".txt")
#   # print(df_name)
#   # print(stat_name)
#   
#   # print(nrow(df_out))
#   
#   out_path <- file.path(out_dir, paste0(df_name, "_", stat_name, "_", "L", stat_cutoff, ".txt"))
#   # print(out_path)
#   
#   write_tsv(df_out, out_path)
#   
#   cat("\ndf written to", out_path)
# }
# write_paralog_lm_output_df(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope,
#                            fdr, 0.1)
# 
# write_paralog_lm_output_df(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope,
#                            fdr, 0.1)
# 
# write_paralog_lm_output_df(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope,
#                            fdr, 0.1)
# 
# write_paralog_lm_output_df(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope,
#                            fdr, 0.1)

Outliers for top lm hits

## A2 top hits
target_list <- c("AKT1_AKT2", "AKT2_AKT3", "AKT1_AKT3", "PTPN11_PTPN6", "CDK2_CDK5", "CDK1_CDK2", "MAP2K1_MAP2K2")
compound_list <- c("GSK2110183", "MK−2206", "GDC−0068", "AZD5363", "BVT_948", "bosutinib", "NU6027", "canertinib")

## robustly scale expression data
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_unfiltered <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(target %in% target_list) %>%
  filter(grepl(str_c(compound_list, collapse = "|"), compound_name)) %>%
  filter(!is.na(A2_log_tpm) | !is.na(dependency)) %>%
  group_by(target, compound_id) %>%
  mutate(scaled_A2_log_tpm = RobScale(A2_log_tpm, center = TRUE, scale = TRUE)) %>%
  mutate(A2_expr_outlier_flag = case_when(
    scaled_A2_log_tpm < -1.5 ~ "low",
    scaled_A2_log_tpm > 1.5 ~ "high",
    TRUE ~ "neither")) %>%
  mutate(scaled_A1_log_tpm = RobScale(A1_log_tpm, center = TRUE, scale = TRUE)) %>%
  mutate(A1_expr_outlier_flag = case_when(
    scaled_A1_log_tpm < -1.5 ~ "low",
    scaled_A1_log_tpm > 1.5 ~ "high",
    TRUE ~ "neither")) %>%
  ungroup() %>%
  mutate(A2_expr_outlier_flag = factor(A2_expr_outlier_flag, levels = c("low", "neither", "high")))%>%
  mutate(A1_expr_outlier_flag = factor(A1_expr_outlier_flag, levels = c("low", "neither", "high")))
# d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_unfiltered
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_unfiltered %>%
  group_by(target, compound_id) %>%
  ## get rid of drug/targets that don't have any "low" cell lines
  filter(any(A2_expr_outlier_flag == "low")) %>%
  ungroup() 

d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_grp_med_diff <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
  group_by(target, compound_id, A2_expr_outlier_flag) %>%
  ## calculate median dependency score for each outlier group
  summarize(grp_med_dependency = median(dependency)) %>%
  pivot_wider(names_from = A2_expr_outlier_flag, values_from = grp_med_dependency) %>%
  ## calculate difference between low and neither groups
  mutate(grp_med_diff = low - neither) %>%
  dplyr::select(target, compound_id, grp_med_diff) 

d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_grp_med_diff, by = c("target", "compound_id"))

## low outliers and drug sensitivity
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_pvals <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
  filter(A2_expr_outlier_flag != "high") %>%
  group_by(target, compound_id) %>%
  summarize(p_val = wilcox.test(dependency ~ A2_expr_outlier_flag)$p.value)
# d.drug_sens_all_genes_expr_outlier_low_wrst_sens_pvals

## adjust for multiple testing overall
d.stats <- BH_adjust(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_pvals)
## Adding missing grouping variables: `target`
# d.stats

d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
  left_join(d.stats, by = c("target", "compound_id")) 

## filter for hits where low-expressing cell lines are more sensitive to the drug
## (low group has lower median than neither group)
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst %>%
  filter(grp_med_diff < 0)

## filter for hits where low-expressing ell lines are more resistant to the drug
## (low group has higher median than neither group)
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_resist <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst %>%
  filter(grp_med_diff > 0)

summarize_stats(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens)
## df: d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens 
## # target/compound pairs with
## p < 0.05: 5 
## p < 0.01: 5 
## FDR < 0.1: 5 
## FDR < 0.05: 5
summarize_stats(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_resist)
## df: d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_resist 
## # target/compound pairs with
## p < 0.05: 0 
## p < 0.01: 0 
## FDR < 0.1: 0 
## FDR < 0.05: 0
Plot
# d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens

plot_list <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens %>%
  unite("target_compound_id", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
  distinct(target_compound_id) %>%
  pull(target_compound_id)

make_outlier_plot(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
                  plot_list[1:4], "low", A2_expr_outlier_flag, A2_log_tpm)

make_outlier_plot(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
                  plot_list[5:8], "low", A2_expr_outlier_flag, A2_log_tpm)

make_lineage_plot(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
                  plot_list, A2_log_tpm, A2_expr_outlier_flag)

write_rds(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens, 
          file.path(out_dir, "d.top_outlier_pairs"))